+– 2020-google-user-study.Rproj +– plots <- folder for final and polished charts and maps +– proc_data <- folder for all data that has been processed or saved from the raw files +– raw_data <- folder for raw data files +– README.md -- rmd +– analysis.Rmd <- file for modeling data and polishing charts -- final.Rmd <- final file with polished graphs and results description
if (!requireNamespace("waffle")) install.packages("waffle")## packages
library(tidyverse)
library(tidytext)
library(colorspace)
library(sf)
library(maps)
library(albersusa) # remotes::install_github("hrbrmstr/albersusa")
library(rgeocodio) # remotes::install_github("hrbrmstr/rgeocodio")
library(geojsonio)
library(rgeos)
library(ggwordcloud)
library(patchwork)
library(pdftools)
library(showtext)
## save plots?
save <- TRUE
#save <- FALSE
## quality of png's
dpi <- 750
## font
font_add_google("Montserrat", "Montserrat")
font_add_google("Overpass", "Overpass")
font_add_google("Overpass Mono", "Overpass Mono")
## theme updates
theme_set(ggthemes::theme_clean(base_size = 15, base_family = "Montserrat"))
theme_update(plot.margin = margin(30, 30, 30, 30),
plot.background = element_rect(color = "white",
fill = "white"),
plot.title = element_text(size = 22,
face = "bold",
lineheight = 1.05,
hjust = .5,
margin = margin(10, 0, 25, 0)),
plot.title.position = "plot",
plot.caption = element_text(color = "grey40",
size = 9,
margin = margin(20, 0, -20, 0)),
plot.caption.position = "plot",
axis.line.x = element_line(color = "black",
size = .8),
axis.line.y = element_line(color = "black",
size = .8),
axis.title.x = element_text(size = 16,
face = "bold",
margin = margin(t = 20)),
axis.title.y = element_text(size = 16,
face = "bold",
margin = margin(r = 20)),
axis.text = element_text(size = 11,
color = "black",
face = "bold"),
axis.text.x = element_text(margin = margin(t = 10)),
axis.text.y = element_text(margin = margin(r = 10)),
axis.ticks = element_blank(),
panel.grid.major.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.major.y = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.x = element_line(size = .6,
color = "#eaeaea",
linetype = "solid"),
panel.grid.minor.y = element_blank(),
panel.spacing.x = unit(4, "lines"),
panel.spacing.y = unit(2, "lines"),
legend.position = "top",
legend.title = element_text(family = "Montserrat",
color = "black",
size = 14,
margin = margin(5, 0, 5, 0)),
legend.text = element_text(family = "Montserrat",
color = "black",
size = 11,
margin = margin(4.5, 4.5, 4.5, 4.5)),
legend.background = element_rect(fill = NA,
color = NA),
legend.key = element_rect(color = NA, fill = NA),
#legend.key.width = unit(5, "lines"),
#legend.spacing.x = unit(.05, "pt"),
#legend.spacing.y = unit(.55, "pt"),
#legend.margin = margin(0, 0, 10, 0),
strip.text = element_text(face = "bold",
margin = margin(b = 10)))
## theme settings for flipped plots
theme_flip <-
theme(panel.grid.minor.x = element_blank(),
panel.grid.minor.y = element_line(size = .6,
color = "#eaeaea"))
## theme settings for maps
theme_map <-
theme_void(base_family = "Montserrat") +
theme(legend.direction = "horizontal",
legend.box = "horizontal",
legend.margin = margin(10, 10, 10, 10),
legend.title = element_text(size = 17,
face = "bold"),
legend.text = element_text(color = "grey33",
size = 12),
plot.margin = margin(15, 5, 15, 5),
plot.title = element_text(face = "bold",
size = 20,
hjust = .5,
margin = margin(30, 0, 10, 0)),
plot.subtitle = element_text(face = "bold",
color = "grey33",
size = 17,
hjust = .5,
margin = margin(10, 0, -30, 0)),
plot.caption = element_text(size = 14,
color = "grey33",
hjust = .97,
margin = margin(-30, 0, 0, 0)))
## numeric format for labels
num_format <- scales::format_format(big.mark = ",", small.mark = ",", scientific = F)
## main color backlinko
bl_col <- "#00d188"
## colors + labels for interval stripes
int_cols <- c("#bce2d5", "#79d8b6", bl_col, "#009f66", "#006c45", "#003925")
int_perc <- c("100%", "95%", "75%", "50%", "25%", "5%")
## colors for degrees (Bachelors, Massters, Doctorate in reverse order)
cols_degree <- c("#e64500", "#FFCC00", darken(bl_col, .1))
## gradient colors for position
colfunc <- colorRampPalette(c(bl_col, "#bce2d5"))
pos_cols <- colfunc(10)#saving plots function example
if (save == T){
ggsave(here::here("plots", "1_1_jobs_word.pdf"), width = 12, height = 11, device = cairo_pdf)sheets <- readRDS(here::here("proc_data/fetched_data.RDS"))Check consistency of column names
name_summary <-
sheets %>%
map_dfr(~ tibble(nm = colnames(.)), .id = "sheet_name") %>%
spread(nm, nm) %>%
mutate_at(-1, ~ !is.na(.))
inconsistent_cols <- Filter(Negate(all), name_summary[-1])
# show columns that are not on every question
inconsistent_cols## # A tibble: 7 x 5
## ...19 `Clicks on orga~ `Clicks on orga~ `Number of diff~ `Number of page~
## <lgl> <lgl> <lgl> <lgl> <lgl>
## 1 TRUE TRUE FALSE TRUE FALSE
## 2 TRUE FALSE TRUE TRUE FALSE
## 3 FALSE FALSE TRUE FALSE TRUE
## 4 FALSE TRUE FALSE FALSE TRUE
## 5 FALSE FALSE TRUE FALSE TRUE
## 6 TRUE TRUE FALSE FALSE TRUE
## 7 FALSE TRUE FALSE FALSE TRUE
We need to fix typos and harmonise some names, we also remove the col “..19” as it’s just an unnamed empty column.
for (nm in names(sheets)) {
sheets[[nm]][["...19"]] <- NULL
names(sheets[[nm]]) <- sub("organis ", "organic ", names(sheets[[nm]]))
names(sheets[[nm]]) <- sub("different ULRs/pages", "pages", names(sheets[[nm]]))
}
# now we try again
name_summary <-
sheets %>%
map_dfr(~ tibble(nm = colnames(.)), .id = "sheet_name") %>%
spread(nm, nm) %>%
mutate_at(-1, ~ !is.na(.))
inconsistent_cols <- Filter(Negate(all), name_summary[-1])
if (ncol(inconsistent_cols)) stop("we still have inconsistent column names")Now that we have consistent column names, we must ensure we have consistent type.
type_summary <-
sheets %>%
map_dfr(summarize_all, typeof, .id = "sheet_name")
inconsistent_cols <- Filter(function(x) n_distinct(x) != 1, type_summary[-1])
if (ncol(inconsistent_cols)) stop("we have inconsistent columns types")We can now concatenate the data of all questions in a single table
full_data <- bind_rows(sheets, .id = "question")
# remove row containing only NAs appart from é first columns
full_data <- full_data[rowSums(is.na(full_data)) != ncol(full_data) - 2,]
# convert Y/N/y/n to TRUE/FALSE
full_data <- modify_if(full_data, ~all(tolower(unique(.)) %in% c("y","n")), ~ tolower(.) == "y")We observe that item “If clicked on Google Maps/local listing, did they click on one of the first three listings?” was assigned “N” when it is not relevant (no click on google map listing) so we fix by setting those to NA.
full_data[["If clicked on Google Maps/local listing, did they click on one of the first three listings?"]] <-
ifelse(full_data[["Clicks on Google Maps/local listings"]],
full_data[["If clicked on Google Maps/local listing, did they click on one of the first three listings?"]],
NA)
saveRDS(full_data, here::here("proc_data/clean_data.RDS"))We go through all items and drive out key insights.
plots <- list()item <- "When typing the search query, did they select one of the suggested searches? (Y/N)"
if (names(full_data)[[3]] != item)
stop("wrong order of columns")
plots[[item]] <- full_data %>%
group_by(question) %>%
summarize_at(item, ~100*mean(.)) %>%
arrange_at(item) %>%
mutate(fill = "a") %>%
bind_rows(summarise_at(., 2, mean) %>% mutate(question = "Overall", fill = "b")) %>%
mutate(question = factor(question, question)) %>%
ggplot() +
geom_col(aes(
question,
!!sym(item),
fill = fill)) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 30, hjust = 1)) +
ggtitle(item) +
labs(x = "", y = "% of users who selected a suggestion")
plots[[item]]item <- "Seconds to first click (use check time stamping on video)"
if (names(full_data)[[4]] != item)
stop("wrong order of columns")
full_data %>%
group_by_at(4) %>%
summarize(n = n()) %>%
ggplot(aes(!!sym(item), n)) +
geom_point() +
geom_line() +
ggtitle("raw chart")plot_data <- full_data %>%
group_by_at(4) %>%
summarize(pct = n()/nrow(.)) %>%
mutate_at("pct", cumsum)
# quantile data with labels
q_data = tibble(
q = quantile(full_data[[item]], c(.25,.5,.75)),
x = approx(plot_data$pct, plot_data[[item]], xout = c(.25,.5,.75))$y,
y = c(.25,.50,.75),
lab = sprintf("%s%% click before %s sec", y*100, q))
plots[[item]] <- plot_data %>%
ggplot(aes(!!sym(item), pct)) +
geom_line() +
coord_cartesian(xlim = c(0,50)) +
ggtitle("Time to first click") +
labs(x = "time (seconds)", y = "% of users who click before time") +
geom_point(aes(x,y), data =q_data, size = 4) +
geom_text(aes(x,y, label = lab), data = q_data, hjust = 0, nudge_x = 3) +
scale_y_continuous(labels = function(x) scales::percent(x,1))
plots[[item]]item <- "Did they scroll to the end of the Google Search Page (Y/N)?"
if (names(full_data)[[5]] != item)
stop("wrong order of columns")
plots[[item]] <- full_data %>%
group_by(question) %>%
summarize_at(item, ~100*mean(.)) %>%
arrange_at(item) %>%
mutate(fill = "a") %>%
bind_rows(summarise_at(., 2, mean) %>% mutate(question = "Overall", fill = "b")) %>%
mutate(question = factor(question, question)) %>%
ggplot() +
geom_col(aes(
question,
!!sym(item),
fill = fill)) +
theme(
legend.position = "none",
axis.text.x = element_text(angle = 30, hjust = 1)) +
ggtitle(item) +
labs(x = "", y = "% of users who scrolled to the bottom")
plots[[item]]item <- "Times changed/modified search query"
if (names(full_data)[[6]] != item)
stop("wrong order of columns")
table(full_data[[6]])##
## 0 1 2 3 4 5 7 11
## 1539 193 39 13 9 3 2 1
plots[[item]] <- full_data %>%
transmute_at(item, ~ifelse(.>1, "2+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of changes", y = "% of users")
plots[[item]]item <- "Times searcher clicks a result, then bounces back to the search results page and chooses a different result"
table(full_data[[7]])##
## 0 1 2 3 4 5 6 8 9 11
## 1493 209 66 20 5 2 1 2 1 1
plots[[item]] <- full_data %>%
transmute_at(item, ~ifelse(.>1, "2+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks followed by bounces", y = "% of users")
plots[[item]]item <- "Number of pages visited"
table(full_data[[item]])##
## 0 1 2 3 4 5 6 7 8 9 10 15
## 87 1054 410 145 47 29 17 3 6 1 1 1
plots[[item]] <- full_data %>%
transmute_at(item, ~ifelse(.>=4, "4+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of pages", y = "% of users")
plots[[item]]item <- "Clicks on organic search results"
table(full_data[[item]])##
## 0 1 2 3 4 5 6 7 8 9 13
## 625 875 175 70 24 19 7 3 1 1 1
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=3, "3+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]]item <- "Clicks on Google AdWords/paid listings"
table(full_data[[item]])##
## 0 1 2 3 4 5 6 7 10
## 1456 272 46 15 2 6 2 1 1
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=2, "2+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]] item <- "Clicks on video results"
table(full_data[[item]])##
## 0 1 2 3 4 6
## 1603 173 15 7 2 1
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=2, "2+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]] item <- "Clicks on Google Maps/local listings"
table(full_data[[item]])##
## 0 1 2 3 4 5
## 1684 62 36 13 5 1
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=2, "2+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]]item <- "If clicked on Google Maps/local listing, did they click on one of the first three listings?"
table(full_data[[item]], full_data[["question"]], useNA = "always")##
## Q1_Commercial_1_pyhsical Q2_Commercial_2_service Q3_Local
## FALSE 0 0 3
## TRUE 0 0 105
## <NA> 256 257 150
##
## Q4_Informational_1_video Q5_Informational_2_indeepth
## FALSE 1 0
## TRUE 0 0
## <NA> 257 259
##
## Q6_transactional Q7_Commercial_3_service <NA>
## FALSE 0 0 0
## TRUE 8 0 0
## <NA> 249 256 0
full_data %>%
group_by(question) %>%
summarize(clickers = sum(`Clicks on Google Maps/local listings` != 0), first3 = 100*mean(!!sym(item), na.rm = TRUE))## # A tibble: 7 x 3
## question clickers first3
## <chr> <int> <dbl>
## 1 Q1_Commercial_1_pyhsical 0 NaN
## 2 Q2_Commercial_2_service 0 NaN
## 3 Q3_Local 108 97.2
## 4 Q4_Informational_1_video 1 0
## 5 Q5_Informational_2_indeepth 0 NaN
## 6 Q6_transactional 8 100
## 7 Q7_Commercial_3_service 0 NaN
full_data %>%
summarize(clickers = sum(`Clicks on Google Maps/local listings` != 0), first3 = 100*mean(!!sym(item), na.rm = TRUE))## # A tibble: 1 x 2
## clickers first3
## <int> <dbl>
## 1 117 96.6
We don’t have reliable numbers here for any conclusion
item <- "Clicks on Image blocks"
table(full_data[[item]], useNA = "always")##
## 0 1 2 4 <NA>
## 1792 5 2 1 1
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]]Here let’s make sure that the instruction and measurement were ok because number is really low
item <- "Clicks on Google Shopping results"
table(full_data[[item]], useNA = "always")##
## 0 1 2 3 5 6 9 14 <NA>
## 1739 31 17 10 1 1 1 1 0
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]]item <- "Clicks \"people also ask\" box"
table(full_data[[item]], useNA = "always")##
## 0 1 2 3 4 7 12 <NA>
## 1748 29 15 5 2 1 1 0
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]]item <- "Clicks to 2nd+ Google page"
table(full_data[[item]], useNA = "always")##
## 0 1 2 3 4 <NA>
## 1793 3 2 1 2 0
plots[[item]] <-
full_data %>%
transmute_at(item, ~ifelse(.>=1, "1+", .)) %>%
group_by_at(item) %>%
count() %>%
ungroup() %>%
mutate(n = n/sum(n)) %>%
na.omit() %>%
ggplot(aes(!!sym(item),n)) +
geom_col() +
ggtitle(item) +
scale_y_continuous(labels = function(x) scales::percent(x,1)) +
labs(x = "number of clicks", y = "% of users")
plots[[item]] item <- "Seconds to complete search task"
full_data %>%
group_by_at(item) %>%
summarize(n = n()) %>%
ggplot(aes(!!sym(item), n)) +
geom_point() +
geom_line() +
ggtitle("raw chart")# quantile data with labels
plot_data <- full_data %>%
group_by_at(item) %>%
summarize(pct = n()/nrow(.)) %>%
mutate_at("pct", cumsum)
q_data = tibble(
q = quantile(full_data[[item]], c(.25,.5,.75)),
x = approx(plot_data$pct, plot_data[[item]], xout = c(.25,.5,.75))$y,
y = c(.25,.50,.75),
lab = sprintf("%s%% complete before %s sec", y*100, q))
plots[[item]] <-
plot_data %>%
ggplot(aes(!!sym(item), pct)) +
geom_line() +
coord_cartesian(xlim = c(0,300)) +
ggtitle("Time to complete a search task") +
labs(x = "time (seconds)", y = "% of users who complete the task before time") +
geom_point(aes(x, y), data =q_data, size = 4) +
geom_text(aes(x, y, label = lab), data = q_data, hjust = 0, nudge_x = 3) +
scale_y_continuous(labels = function(x) scales::percent(x,1))
plots[[item]]item <- "Total Clicks"
table(full_data[[item]], useNA = "always")##
## 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14
## 5 98 185 252 160 246 155 140 101 87 66 48 48 32 33
## 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29
## 28 16 20 9 12 2 8 5 2 1 8 2 5 4 2
## 30 31 32 33 34 35 36 37 38 41 60 86 94 <NA>
## 3 2 2 1 3 2 1 2 1 1 1 1 1 0
# quantile data with labels
plot_data <- full_data %>%
group_by_at(item) %>%
summarize(pct = n()/nrow(.)) %>%
mutate_at("pct", cumsum)
q_data = tibble(
q = quantile(full_data[[item]], c(.25,.5,.75)),
x = approx(plot_data$pct, plot_data[[item]], xout = c(.25,.5,.75))$y,
y = c(.25,.50,.75),
lab = sprintf("%s%% click less than %s times", y*100, q))
plots[[item]] <-
plot_data %>%
ggplot(aes(!!sym(item), pct)) +
geom_line() +
#geom_point() +
coord_cartesian(xlim = c(0,50)) +
ggtitle("Total number of clicks") +
labs(x = "number of clicks", y = "% of users who clicked less than the given numeber of clicks") +
geom_point(aes(x, y), data = q_data, size = 4) +
geom_text(aes(x, y, label = lab), data = q_data, hjust = 0, nudge_x = 3) +
scale_y_continuous(labels = function(x) scales::percent(x,1))
plots[[item]]saveRDS(plots, here::here("proc_data/plots.RDS"))